Plasma only samples
import pandas as pd
samples = pd.read_csv('data/sample_sheet.csv')
# must be Healthy Control Study and must pass MiSeq QC
samples = samples.loc[(samples['Study'] == 'Healthy Controls') & (samples['MISEQ.QC.PASS'] == 'PASS')]
samples = samples.set_index('MT.Unique.ID').sort_values(by=['Participant.ID', 'Source'])
# capitalize & strip whitespace for consistency
for column in ['Gender', 'Race', 'Source']:
samples[column] = samples[column].str.capitalize()
samples[column] = samples[column].str.strip()
# use correct ontology terms
race_ontology = {'Asian': 'Asian',
'Black or african american': 'African_American',
'Mixed/asian & white': 'Multiracial',
'Mixed/asian &black': 'Multiracial',
'Mixed/black, white, asian': 'Multiracial',
'Native hawiian or other pacific islander': 'Pacific_Islander',
'Pacific islander': 'Pacific_Islander',
'White': 'White'}
for id in samples.index:
race = samples.at[id, 'Race']
samples.at[id, 'Race'] = race_ontology[race] if race in race_ontology else 'Multiracial'
# get only the plasma samples
mir_counts = pd.read_csv("data/get_canonical/canon_mir_counts.csv", index_col=0)
samples.index = pd.Index(['X' + str(row) for row in samples.index])
plasma_samples = samples.loc[samples['Source'] == "Plasma"]
plasma_mir_counts = mir_counts[plasma_samples.index]
plasma_samples.to_csv('data/plasma_samples.csv')
plasma_mir_counts.to_csv('data/plasma_mir_counts.csv')
Load the sample data and miRNA counts
samples <- read.csv('data/plasma_samples.csv')
counts <- read.csv('data/plasma_mir_counts.csv')
samples <- subset(samples, Library.Generation.Set != "SetRecheck" ) # exclude SetRecheck samples
samples <- samples[samples$X != "X11" & samples$X != "X92",] # remove outliers
samples$Participant.ID <- factor(samples$Participant.ID) # ID is categorical, not numerical
rownames(counts) = counts$X
rownames(samples) = samples$X
counts$X <- NULL # remove extra column
samples$X <- NULL
counts <- counts[,unique(rownames(samples))]
head(samples)
head(counts)
Filter
library(edgeR)
design <- model.matrix(~0+Race+Gender+Age, samples)
dge = DGEList(counts = counts, samples = samples)
# require miRNAs to have CPM > 1 in at least 2 samples
countsPerMillion <- edgeR::cpm(dge)
countCheck <- countsPerMillion > 1
head(countCheck)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X25
hsa-let-7a-3p TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
hsa-let-7a-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7c-3p FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X48
hsa-let-7a-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7a-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
X49 X50 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X76 X77
hsa-let-7a-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7a-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X93 X96 X99 X100 X102 X103 X104 X105 X106
hsa-let-7a-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7a-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-3p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7b-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
keep <- which(rowSums(countCheck) >= 2)
dge <- dge[keep,]
Explore variance


hist(reads_sce$total_counts, breaks=100) # counts per sample
hist(reads_sce$total_features, breaks=100) # counts per miRNA
plotQC(reads_sce, type = "highest-expression")

plotQC(reads_sce, type="explanatory-variables", variables=c("Index", "Participant.ID", "Collection.Date", "Library.Generation.Set", "MiSeq.QC.Run", 'total_features', "Age", "Race", "Gender"))

Examine sources of variation
plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Library.Generation.Set", size_by = "total_features")
non-plotting arguments like 'exprs_values' should go in 'run_args'

plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Race", shape_by="Gender", size_by = "total_features")
non-plotting arguments like 'exprs_values' should go in 'run_args'

for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
print(
plotQC(reads_sce, type = "find-pcs", exprs_values = "logcounts", variable = var)
) + ggtitle(var)
}








Normalize & Remove unwanted sources of variation
library(RUVSeq)
library(ggplot2)
library(mvoutlier)
dge <- calcNormFactors(dge)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
res <- residuals(fit, type="deviance")
set <- newSeqExpressionSet(dge$counts, phenoData=dge$samples)
ruvr_sets <- list()
for(k in 1:5) {
ruvr_sets[[k]] <- RUVr(set, row.names(dge), k=k, res)
assay(reads_sce, paste("RUVr k=", toString(k))) <- log2(t(t(assayData(ruvr_sets[[k]])$normalizedCounts) / colSums(assayData(ruvr_sets[[k]])$normalizedCounts) * 1e6) + 1)
}
for(n in assayNames(reads_sce)) {
print(
plotPCA(
reads_sce,
colour_by = "Library.Generation.Set",
size_by = "total_features",
exprs_values = n
) + ggtitle(paste("Batch",n))
)
print(
plotPCA(
reads_sce,
colour_by = "Race",
shape_by = "Gender",
size_by = "total_features",
exprs_values = n
) + ggtitle(paste("Demographics", n))
)
}
non-plotting arguments like 'exprs_values' should go in 'run_args'
















Detect outliers
reads_sce <- runPCA(reads_sce, use_coldata = TRUE, detect_outliers = TRUE)
failed to find 'pct_counts_feature_control' in column metadatafailed to find 'total_features_by_counts_feature_control' in column metadatafailed to find 'log10_total_counts_endogenous' in column metadatafailed to find 'log10_total_counts_feature_control' in column metadata
outliers <- colnames(reads_sce)[reads_sce$outlier]
head(outliers)
character(0)
Examine sources of variance after removing unwanted variation
for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
print(
plotQC(reads_sce, type = "find-pcs", exprs_values = "RUVr k= 2", variable = var)
) + ggtitle(var)
}








Visualize top highly-expressed miRNAs male-vs-female
library(RColorBrewer)
library(reshape2)
# Ryan's code, modified
norm.expr.matr <- exprs(reads_sce)
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.male.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Male"]))
mean.expr.female.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Female"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.male.rank <= top_N | mean.expr.female.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Gender <- ""
for (row_num in 1:nrow(norm.expr.melt)){ # Pull the Gender values from the column metadata.
mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
norm.expr.melt[row_num, "Gender"] <- as.character(samples[mt_unique_id,"Gender"])
}
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) +
ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") +
theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"),
panel.grid.major.y = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) +
ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") +
theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"),
panel.grid.major.x = element_blank()) + coord_flip()

Visualize top highly-expressed miRNAs by Race
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.afr.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="African_American"]))
mean.expr.asn.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="Asian"]))
mean.expr.wht.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="White"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.afr.rank <= top_N | mean.expr.asn.rank <= top_N | mean.expr.wht.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Race <- ""
for (row_num in 1:nrow(norm.expr.melt)){ # Pull the Gender values from the column metadata.
mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
norm.expr.melt[row_num, "Race"] <- as.character(samples[mt_unique_id,"Race"])
}
norm.expr.melt <- norm.expr.melt[norm.expr.melt$Race %in% c("African_American", "Asian", "White"),]
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) +
ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") +
theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"),
panel.grid.major.y = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) +
ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") +
theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"),
panel.grid.major.x = element_blank()) + coord_flip()

Heatmap of most highly expressed miRNAs
library(pheatmap)
ruvr2 <- ruvr_sets[[2]]
norm.expr.matr <- norm.expr.matr[order(rowMeans(norm.expr.matr), decreasing=TRUE),]
Demographics <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Demographics)
rownames(annotations) <- rownames(pData(ruvr2))
pheatmap(norm.expr.matr[0:50,], cluster_rows=FALSE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Top 50 Expressed miRNAs")

DE analysis with EdgeR
# based on Ryan's code, pass RUVSeq-corrected data to edgeR
ruvr2 <- ruvr_sets[[2]]
design <- model.matrix(~ 0 + Race + Gender + Age + W_1 + W_2, pData(ruvr2))
dge <- estimateDisp(dge, design = design, tagwise = TRUE, robust = TRUE)
fit <- glmFit(dge, design)
Contrast gender
lrt <- glmLRT(fit, coef = "GenderMale")
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,]) # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Gender <- pData(ruvr2)[,c("Gender")]
annotations <- as.data.frame(Gender)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}


Contrast race
lrt <- glmLRT(fit, contrast=makeContrasts(asn=(RaceAsian-(RaceAfrican_American+RaceWhite)/2),
afr=(RaceAfrican_American-(RaceAsian+RaceWhite)/2),
wht=(RaceWhite-(RaceAfrican_American+RaceAsian)/2), levels=design))
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC.asn) > logFC_threshold | abs(results$logFC.afr) > logFC_threshold | abs(results$logFC.wht) > logFC_threshold,]) # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Demographics <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Demographics)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}


Create a PCA plot showing Age x Gender
plotPCA(
reads_sce,
colour_by = "Age",
shape_by = "Gender",
size_by = "total_features",
exprs_values = "RUVr k= 2"
) + ggtitle("RUVSeq-Normalized Expression (k=2)")
non-plotting arguments like 'exprs_values' should go in 'run_args'

save.image("diff_expr_plasma_only.RData")
---
title: "miRNA Differential expression analysis for plasma samples"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---
# Plasma only samples
```{python3}
import pandas as pd
samples = pd.read_csv('data/sample_sheet.csv')
# must be Healthy Control Study and must pass MiSeq QC
samples = samples.loc[(samples['Study'] == 'Healthy Controls') & (samples['MISEQ.QC.PASS'] == 'PASS')]  
samples = samples.set_index('MT.Unique.ID').sort_values(by=['Participant.ID', 'Source'])
# capitalize & strip whitespace for consistency
for column in ['Gender', 'Race', 'Source']:
    samples[column] = samples[column].str.capitalize()
    samples[column] = samples[column].str.strip()
# use correct ontology terms
race_ontology = {'Asian': 'Asian',
 'Black or african american': 'African_American',
 'Mixed/asian & white': 'Multiracial',
 'Mixed/asian &black': 'Multiracial',
 'Mixed/black, white, asian': 'Multiracial',
 'Native hawiian or other pacific islander': 'Pacific_Islander',
 'Pacific islander': 'Pacific_Islander',
 'White': 'White'}
for id in samples.index:
    race = samples.at[id, 'Race']
    samples.at[id, 'Race'] = race_ontology[race] if race in race_ontology else 'Multiracial'
# get only the plasma samples
mir_counts = pd.read_csv("data/get_canonical/canon_mir_counts.csv", index_col=0)
samples.index = pd.Index(['X' + str(row) for row in samples.index])
plasma_samples = samples.loc[samples['Source'] == "Plasma"]
plasma_mir_counts = mir_counts[plasma_samples.index]
plasma_samples.to_csv('data/plasma_samples.csv')
plasma_mir_counts.to_csv('data/plasma_mir_counts.csv')
```

## Load the sample data and miRNA counts
```{r}
samples <- read.csv('data/plasma_samples.csv')
counts <- read.csv('data/plasma_mir_counts.csv')
samples <- subset(samples, Library.Generation.Set != "SetRecheck" )  # exclude SetRecheck samples
samples <- samples[samples$X != "X11" & samples$X != "X92",] # remove outliers
samples$Participant.ID <- factor(samples$Participant.ID)  # ID is categorical, not numerical
rownames(counts) = counts$X
rownames(samples) = samples$X
counts$X <- NULL  # remove extra column
samples$X <- NULL
counts <- counts[,unique(rownames(samples))]
head(samples)
head(counts)
```

## Filter
```{r}
library(edgeR)
design <- model.matrix(~0+Race+Gender+Age, samples)
dge = DGEList(counts = counts, samples = samples)
# require miRNAs to have CPM > 1 in at least 2 samples
countsPerMillion <- edgeR::cpm(dge)
countCheck <- countsPerMillion > 1
head(countCheck)
keep <- which(rowSums(countCheck) >= 2) 
dge <- dge[keep,]
```
## Explore variance
```{r}
library(SingleCellExperiment)
library(scater)
reads_sce <- SingleCellExperiment(assays=list(counts=dge$counts),  colData=dge$samples)
# remove unexpressed miRNAs
keep_feature <- rowSums(counts(reads_sce) > 0) > 0
reads_sce <- reads_sce[keep_feature, ]
reads_sce <- calculateQCMetrics(reads_sce)
# log transform
cpm(reads_sce) <- calculateCPM(reads_sce)
reads_sce <- normalize(reads_sce)
logcounts(reads_sce) <- log2(calculateCPM(reads_sce) + 1)
# visualize
hist(reads_sce$total_counts, breaks=100)  # counts per sample
hist(reads_sce$total_features, breaks=100)  # counts per miRNA
plotQC(reads_sce, type = "highest-expression")
plotQC(reads_sce, type="explanatory-variables", variables=c("Index", "Participant.ID", "Collection.Date", "Library.Generation.Set", "MiSeq.QC.Run", 'total_features', "Age", "Race", "Gender"))
```
## Examine sources of variation
```{r}
plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Library.Generation.Set", size_by = "total_features")
plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Race", shape_by="Gender", size_by = "total_features")
for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "logcounts", variable = var)
    )  + ggtitle(var) 
  }
```

## Normalize & Remove unwanted sources of variation
```{r}
library(RUVSeq)
library(ggplot2)
library(mvoutlier)
dge <- calcNormFactors(dge)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
res <- residuals(fit, type="deviance")
set <- newSeqExpressionSet(dge$counts, phenoData=dge$samples)
ruvr_sets <- list()
for(k in 1:5) {
  ruvr_sets[[k]] <- RUVr(set, row.names(dge), k=k, res)
  assay(reads_sce, paste("RUVr k=", toString(k))) <- log2(t(t(assayData(ruvr_sets[[k]])$normalizedCounts) / colSums(assayData(ruvr_sets[[k]])$normalizedCounts) * 1e6) + 1)
}
for(n in assayNames(reads_sce)) {
  print(
        plotPCA(
            reads_sce,
            colour_by = "Library.Generation.Set",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Batch",n))
  )
  print(
        plotPCA(
            reads_sce,
            colour_by = "Race",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Demographics", n))        
  )
}
```

## Detect outliers
```{r}
reads_sce <- runPCA(reads_sce, use_coldata = TRUE, detect_outliers = TRUE)
outliers <- colnames(reads_sce)[reads_sce$outlier]
head(outliers)
```

## Examine sources of variance after removing unwanted variation
```{r}
for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "RUVr k= 2", variable = var)
    )  + ggtitle(var) 
}
```
## Visualize top highly-expressed miRNAs male-vs-female
```{r}
library(RColorBrewer)
library(reshape2)
# Ryan's code, modified
norm.expr.matr <- exprs(reads_sce)
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.male.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Male"]))
mean.expr.female.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Female"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.male.rank <= top_N | mean.expr.female.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Gender <- ""
for (row_num in 1:nrow(norm.expr.melt)){  # Pull the Gender values from the column metadata.
  mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
  norm.expr.melt[row_num, "Gender"] <- as.character(samples[mt_unique_id,"Gender"])
}
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.y = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.x = element_blank()) + coord_flip()
```
## Visualize top highly-expressed miRNAs by Race
```{r}
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.afr.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="African_American"]))
mean.expr.asn.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="Asian"]))
mean.expr.wht.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="White"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.afr.rank <= top_N | mean.expr.asn.rank <= top_N | mean.expr.wht.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Race <- ""
for (row_num in 1:nrow(norm.expr.melt)){  # Pull the Gender values from the column metadata.
  mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
  norm.expr.melt[row_num, "Race"] <- as.character(samples[mt_unique_id,"Race"])
}
norm.expr.melt <- norm.expr.melt[norm.expr.melt$Race %in% c("African_American", "Asian", "White"),]
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.y = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.x = element_blank()) + coord_flip()
```

## Heatmap of most highly expressed miRNAs
```{r}
library(pheatmap)
ruvr2 <- ruvr_sets[[2]]
norm.expr.matr <- norm.expr.matr[order(rowMeans(norm.expr.matr), decreasing=TRUE),]
Demographics <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Demographics)
rownames(annotations) <- rownames(pData(ruvr2))
pheatmap(norm.expr.matr[0:50,], cluster_rows=FALSE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Top 50 Expressed miRNAs")
```

# DE analysis with EdgeR
```{r}
# based on Ryan's code, pass RUVSeq-corrected data to edgeR
ruvr2 <- ruvr_sets[[2]]
design <- model.matrix(~ 0 + Race + Gender + Age + W_1 + W_2, pData(ruvr2))
dge <- estimateDisp(dge, design = design, tagwise = TRUE, robust = TRUE)
fit <- glmFit(dge, design)
```

## Contrast gender
```{r}
lrt <- glmLRT(fit, coef = "GenderMale")
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Gender <- pData(ruvr2)[,c("Gender")]
annotations <- as.data.frame(Gender)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}
```

## Contrast race
```{r}
lrt <- glmLRT(fit, contrast=makeContrasts(asn=(RaceAsian-(RaceAfrican_American+RaceWhite)/2),
                                          afr=(RaceAfrican_American-(RaceAsian+RaceWhite)/2),
                                          wht=(RaceWhite-(RaceAfrican_American+RaceAsian)/2), levels=design))
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC.asn) > logFC_threshold | abs(results$logFC.afr) > logFC_threshold | abs(results$logFC.wht) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Demographics <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Demographics)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}
```

## Create a PCA plot showing Age x Gender
```{r}
plotPCA(
            reads_sce,
            colour_by = "Age",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = "RUVr k= 2"
) + ggtitle("RUVSeq-Normalized Expression (k=2)") 
```

```{r}
save.image("diff_expr_plasma_only.RData")
```